brunch data crawling

  • Build a table of the writers' Followers.
  • Use a SQL query on the Followers table to pick the users whose following lists should be crawled next (the assumed table layout is sketched below).
  • Add the results to the following table.
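
The notebook does not show the table DDL, so here is a minimal sketch of the assumed SQLite schema for follower_tbl, following_tbl, and writer_info_tbl; the column names come from the insert helpers further below, and the types are assumptions.

import sqlite3

## assumed schema sketch; the original DDL is not part of this notebook
conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
conn.executescript("""
create table if not exists follower_tbl    (writerid text, userid text, tm integer);
create table if not exists following_tbl   (userid text, writerid text, tm integer);
create table if not exists writer_info_tbl (
    writerid text, name text, profile text, imgsrc text,
    documents integer, megazines integer, followers integer, followings integer, tm integer);
""")
conn.commit()
conn.close()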

In [3]:
import sqlite3
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

import matplotlib.pylab as plt
%matplotlib inline

from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
  1. Collecting Brunch Follower (subscriber) and Following (favorite writer) information
    From the <ul class="list_follow"> element, collect the ids whose
    a-tag href starts with "/@".

  • Crawling procedure
    • Crawl the HTML of https://brunch.co.kr/@{user-id}/{following}. Brunch's follower and following lists are public, but the page uses an infinite-scroll UX, so the HTML is fetched through Selenium 2.0 (webdriver).
    • From the crawled HTML, use BS4 to extract only the ids from anchors whose class is link_follow and whose href starts with '/@' (a minimal parsing example follows below).
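
As an illustration of that extraction step (not from the original notebook), parsing a static, hypothetical fragment of the follower-list markup with BS4 could look like this:

from bs4 import BeautifulSoup

## hypothetical, simplified fragment of Brunch's follower-list markup
html = '''
<ul class="list_follow">
  <li><a class="link_follow" href="/@goodvc78">goodvc78</a></li>
  <li><a class="link_follow" href="/@someone">someone</a></li>
</ul>
'''
soup = BeautifulSoup(html, 'html.parser')
ids = [a.get('href')[2:] for a in soup.find_all("a", class_="link_follow")
       if a.get('href', '').startswith('/@')]
print(ids)  # ['goodvc78', 'someone']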

In [4]:
# brunch data crawling with Selenium
# source reference : http://stackoverflow.com/questions/12519074/scrape-websites-with-infinite-scrolling
def crawlBrunchLink(uid, dir='follower', driver=None):
    ## html crawling
    if driver is None:
        driver = webdriver.Firefox()
    url = "https://brunch.co.kr/@{uid}/{dir}".format(uid=uid, dir=dir)
    driver.get(url)

    ## keep scrolling to the bottom until the page size stops growing
    htmlsize = 0
    keep_cnt = 0
    for i in range(1, 200):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.003)  # short pause between scrolls; the size check below decides when to stop
        if htmlsize == len(driver.page_source):
            keep_cnt += 1
        else:
            keep_cnt = 0
            htmlsize = len(driver.page_source)
        if keep_cnt > 5:
            break

    html_source = driver.page_source
    ## extract follower / following ids
    data = html_source.encode('utf-8')
    soup = BeautifulSoup(data, 'html.parser')
    classes = soup.find_all("a", class_="link_follow")
    idlist = []
    for c in classes:
        following = c.get('href')
        if following is None or len(following) == 0:
            continue
        idlist.append(following[2:])  # strip the leading "/@"

    #driver.close()
    return idlist

In [5]:
## crawling favorite-writer profile info
## extract Brunch Writer Info : uid, name, text-count, magazine-count, follower-count, following-count
def extractWriterInfo(uid):
    try:
        response = requests.get("http://brunch.co.kr/@{uid}".format(uid=uid))
    except Exception:
        return []

    data = response.content.decode('utf-8')
    soup = BeautifulSoup(data, 'html.parser')

    ## name
    names = soup.find_all("strong", class_="profileUserName")
    name = uid if len(names) < 1 else names[0].getText()

    ## profile description (default text: "This is {uid}'s brunch.")
    desc = soup.find_all("pre", class_="profileUserDesc")
    desc = "{} 브런치입니다.".format(uid) if len(desc) < 1 else desc[0].getText()

    ## thumbnail image link
    imgsrc = soup.find_all("input", class_="profileUserImageUrl")
    imgsrc = "no-img" if len(imgsrc) < 1 else imgsrc[0].get('value')

    ## numeric counts : documents, magazines, followers, followings
    classes = soup.find_all("span", class_="num_count")
    reserved = [uid, name, desc, imgsrc]
    for c in classes:
        reserved.append(int(c.getText().replace(",", "")))

    ## pad with zeros so the row always has 8 fields
    if len(reserved) < 8:
        for n in range(0, 8 - len(reserved)):
            reserved.append(0)
    return reserved[:8]
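
A quick usage sketch (illustrative only; the concrete values depend on the live profile page): the returned list always has 8 fields, matching the writer_info_tbl columns minus the timestamp.

## illustrative call, not an actual crawl result
info = extractWriterInfo('goodvc78')
## info == [uid, name, profile_text, thumbnail_url, documents, magazines, followers, followings]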

Data collection method

  • Starting from goodvc78's followers, collect data recursively (a sketch of this loop follows below).
  • Collect goodvc78's followers (1) --> collect the writers those followers are following (2) --> collect the followers of those following writers (1) --> (2) ...
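
The cells below run the two phases by hand, batch by batch. As a sketch only, the alternation could also be driven in a single loop using crawlFollowing, crawlFollower, and crawlWriterInfo defined later in this notebook (the number of rounds here is arbitrary):

## sketch of the recursive collection loop; the notebook actually runs these steps manually below
def crawlRecursively(rounds=10, batch=100):
    for n in range(rounds):
        crawlFollowing(batch)    # (2) followings of users already stored as followers
        crawlFollower(batch)     # (1) followers of newly discovered writers
        crawlWriterInfo(batch)   # profile info for newly discovered writers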

In [6]:
def insertData(tbl_name, columns, rows):
    conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
    col_str = ", ".join(columns)
    val_str = ", ".join(['?' for n in columns])

    sql = "insert into {tbl} ({cols}) values ({vals}) ".format(tbl=tbl_name, cols=col_str, vals=val_str)
    try:
        conn.executemany(sql, rows)
        conn.commit()
    except Exception:
        conn.rollback()
    conn.close()

### sample code 
#now = (int(time.time()))
#rows = [['goodvc78', 'test1', now ],['goodvc78', 'test2', now ]]    
#insertData( 'follower_tbl', ['writerid', 'userid', 'tm'], rows)

In [7]:
def insertFollowings(base, id_list):
    ## base (a userid) follows every writer in id_list
    now = (int(time.time()))
    rows = [[base, uid, now] for uid in id_list]
    insertData( 'following_tbl', ['userid', 'writerid', 'tm'], rows)

def insertFollowers(base, id_list):
    ## every user in id_list follows base (a writerid)
    now = (int(time.time()))
    rows = [[base, uid, now] for uid in id_list]
    insertData( 'follower_tbl', ['writerid', 'userid', 'tm'], rows)

def insertWriterInfo(writer_info_list):
    ## append the crawl timestamp to each writer-info row and store them
    now = (int(time.time()))
    rows = []
    for info in writer_info_list:
        info.append(now)
        rows.append(info)
    colnames = ['writerid', 'name', 'profile', 'imgsrc', 'documents', 'megazines', 'followers', 'followings', 'tm']
    insertData( 'writer_info_tbl', colnames, rows)

Collecting goodvc78's follower list


In [6]:
## 1. goodvc78's follower crawling 
base = 'goodvc78'
driver = webdriver.Firefox()
base_follower = crawlBrunchLink(base, dir='follower', driver=driver)
print ("내가 좋아하는 작가의 followers = %d" %  len(base_follower) )
driver.close()


followers of my favorite writer = 176

In [52]:
## 2. goodvc78 follower list insert 
insertFollowers(base, base_follower)


inserted  176 <sqlite3.Cursor object at 0x108608ea0>

In [8]:
## followers whose following list has not been crawled yet
def unreadUserid(limit=100):
    conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
    sql = """ 
    select userid from follower_tbl 
    where userid not in ( select userid from following_tbl) limit {0};""".format(limit)
    
    ds = pd.read_sql(sql, conn)
    conn.close()
    return ds.userid.tolist()

In [9]:
## writers followed by more than 20 users whose follower list has not been crawled yet
def unreadWriterid(limit=10):
    conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
    sql = """ 
    select writerid,count(1) cnt from following_tbl
    where writerid not in ( select writerid from follower_tbl) and writerid !='brunch'
    group by writerid 
    having cnt > 20 
    limit {0};""".format(limit)
    
    ds = pd.read_sql(sql, conn)
    conn.close()
    return ds.writerid.tolist()

In [10]:
## writers followed by at least 2 users whose profile info has not been crawled yet
def unreadWriterInfoid(limit=100):
    conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
    sql = """ 
    select writerid, count(1) cnt from following_tbl
    where writerid not in ( select writerid from writer_info_tbl) and writerid !='brunch'
    group by writerid 
    having cnt > 1 
    limit {0};""".format(limit)
    
    ds = pd.read_sql(sql, conn)
    conn.close()
    return ds.writerid.tolist()

In [11]:
def crawlFollowing(limit=100):
    driver = webdriver.Firefox()
    users = unreadUserid(limit)
    print ("\ncrawling users ", len(users))
    for uid in users :
        following = crawlBrunchLink(uid, dir='following', driver=driver)
        insertFollowings(uid, following)
        print('.',end="")
    driver.close()

In [12]:
def crawlFollower(limit=10):
    driver = webdriver.Firefox()
    writers = unreadWriterid(limit)
    print ("\ncrawling writers ", len(writers))
    for writerid in writers :
        follower = crawlBrunchLink(writerid, dir='follower', driver=driver)
        insertFollowers(writerid, follower)
        print('.',end="")
    driver.close()

In [13]:
def crawlWriterInfo(limit=100):
    writers = unreadWriterInfoid(limit)
    print ("\ncrawling writer info ", len(writers))
    infos = []
    for writerid in writers :
        info = extractWriterInfo(writerid)
        if len(info)!=8:
            print("skipped:{id} {val}".format(id=writerid, val=info))
            continue
        infos.append(info)
        print('.',end="")
    insertWriterInfo(infos)

In [14]:
## following list crawling
for n in range(1,10):
    crawlFollowing(100)


crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................

In [20]:
crawlFollower(100)


crawling writers  100
....................................................................................................

In [21]:
## following list crawling
for n in range(1,100):
    crawlFollowing(100)


crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
....................................................................................................
crawling users  100
.........
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-21-84f901c97df7> in <module>()
      1 ## following list crawling
      2 for n in range(1,100):
----> 3     crawlFollowing(100)

<ipython-input-11-dc8a3e9c298f> in crawlFollowing(limit)
      4     print ("\ncrawling users ", len(users))
      5     for uid in users :
----> 6         following = crawlBrunchLink(uid, dir='following', driver=driver)
      7         insertFollowings(uid, following)
      8         print('.',end="")

<ipython-input-4-12f8f6067289> in crawlBrunchLink(uid, dir, driver)
      4     ## html crawling
      5     url = "https://brunch.co.kr/@{uid}/{dir}".format(uid=uid, dir=dir)
----> 6     driver.get(url)
      7 
      8     htmlsize = 0

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/selenium/webdriver/remote/webdriver.py in get(self, url)
    211         Loads a web page in the current browser session.
    212         """
--> 213         self.execute(Command.GET, {'url': url})
    214 
    215     @property

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params)
    197 
    198         params = self._wrap_value(params)
--> 199         response = self.command_executor.execute(driver_command, params)
    200         if response:
    201             self.error_handler.check_response(response)

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/selenium/webdriver/remote/remote_connection.py in execute(self, command, params)
    393         path = string.Template(command_info[1]).substitute(params)
    394         url = '%s%s' % (self._url, path)
--> 395         return self._request(command_info[0], url, body=data)
    396 
    397     def _request(self, method, url, body=None):

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/selenium/webdriver/remote/remote_connection.py in _request(self, method, url, body)
    424             try:
    425                 self._conn.request(method, parsed_url.path, body, headers)
--> 426                 resp = self._conn.getresponse()
    427             except (httplib.HTTPException, socket.error):
    428                 self._conn.close()

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py in getresponse(self)
   1169 
   1170         try:
-> 1171             response.begin()
   1172             assert response.will_close != _UNKNOWN
   1173             self.__state = _CS_IDLE

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py in begin(self)
    349         # read until we get a non-100 response
    350         while True:
--> 351             version, status, reason = self._read_status()
    352             if status != CONTINUE:
    353                 break

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py in _read_status(self)
    311 
    312     def _read_status(self):
--> 313         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    314         if len(line) > _MAXLINE:
    315             raise LineTooLong("status line")

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/socket.py in readinto(self, b)
    372         while True:
    373             try:
--> 374                 return self._sock.recv_into(b)
    375             except timeout:
    376                 self._timeout_occurred = True

KeyboardInterrupt: 

In [ ]:
## writer info crawling
for n in range(1,2):
    crawlWriterInfo(100)

In [ ]:
## following list crawling
for n in range(1,100):
    crawlFollowing(100)

In [19]:
crawlFollowing(100)


crawling users  12
............
